package bigdata.jobclean.parser

import bigdata.jobclean.{Datas, Job}
import org.apache.spark.sql.Row

import scala.util.matching.Regex

/**
 * Field-level cleaning rules for job postings scraped from 51job (前程无忧).
 *
 * Every `clean*` method maps a raw column value to a normalised one. String-valued
 * cleaners return the sentinel "NULL" for values they cannot interpret, and numeric
 * cleaners return -1; none of them throw on null or malformed input.
 */
class qcParser extends Serializable {

  /**
   * Looks up the province a city belongs to.
   *
   * @param city city name to search for in `Datas.pc`
   * @return the first province whose city collection contains `city`, or "NULL" when none does
   */
  def getProvince(city: String): String =
    Datas.pc
      .collectFirst { case (province, cities) if cities.contains(city) => province }
      .getOrElse("NULL")

  // ---------------------------------------------------------------------------
  // Salary
  // ---------------------------------------------------------------------------

  // Rule 1: a single amount, e.g. "200元/天". Hourly rates ("15元/小时") are deliberately dropped,
  // so only day / month / year are accepted as the period.
  val pattern1 = """([\d.]+)([元千万])/([天月年])""".r
  // Rule 2: a range, e.g. "1.5-2千/月".
  val pattern2 = """([\d.]+)-([\d.]+)([元千万])/([天月年])""".r
  // Rule 3: an open-ended bound, e.g. "10万以上/月" or "1.5千以下/月".
  val pattern3 = """([\d.]+)([元千万])以[上下]/([天月年])""".r

  /**
   * Converts a raw salary string to a monthly amount in yuan.
   *
   * Ranges are replaced by the midpoint of their two bounds; open-ended bounds
   * ("以上" / "以下") use the stated amount as-is.
   *
   * @param salary raw salary text, may be null
   * @return the monthly salary, or -1 when the text is null, matches no rule
   *         (including hourly rates) or contains a malformed number
   */
  def cleanSalary(salary: String): Float = {
    val parsed: Option[(Float, String, String)] = salary match {
      case null => None
      case pattern1(num, unit, duration) =>
        parseAmount(num).map(amount => (amount, unit, duration))
      case pattern2(low, high, unit, duration) =>
        for {
          lo <- parseAmount(low)
          hi <- parseAmount(high)
        } yield ((lo + hi) / 2, unit, duration)
      case pattern3(num, unit, duration) =>
        parseAmount(num).map(amount => (amount, unit, duration))
      case _ => None
    }
    parsed.fold(-1f) { case (amount, unit, duration) => toMonthly(amount, unit, duration) }
  }

  /**
   * Converts an (amount, unit, period) triple to a monthly salary in yuan.
   *
   * @param salary amount as text, unit ("元", "千" or "万") and period ("天", "月" or "年");
   *               unknown units and periods leave the amount unscaled
   * @return the monthly salary
   * @throws NumberFormatException if the amount is not a valid number
   */
  def monthSalary(salary: Tuple3[String, String, String]): Float =
    toMonthly(salary._1.toFloat, salary._2, salary._3)

  /** Parses a numeric capture; `[\d.]+` also admits strings such as "1.2.3", hence the Try. */
  private def parseAmount(text: String): Option[Float] =
    scala.util.Try(text.toFloat).toOption

  /** Scales `amount` by its unit, then normalises the pay period to one month. */
  private def toMonthly(amount: Float, unit: String, duration: String): Float = {
    val inYuan = unit match {
      case "千" => amount * 1000
      case "万" => amount * 10000
      case _ => amount
    }
    duration match {
      // A day is counted as 1/30 of a month. An hourly rule (10 hours a day) used to exist
      // but is disabled: hourly postings are rejected by the patterns instead.
      case "天" => inYuan * 30
      case "年" => inYuan / 12
      case _ => inYuan
    }
  }

  // ---------------------------------------------------------------------------
  // Free-text names
  // ---------------------------------------------------------------------------

  /**
   * Cleans the posting's "name" column.
   *
   * @param name raw value, may be null
   * @return the value with ASCII commas turned into full-width commas and double quotes
   *         removed, or "NULL" for null
   */
  def cleanName(name: String): String = stripDelimiters(name)

  /**
   * Cleans the company name; some of them contain commas and double quotes.
   *
   * @param value raw value, may be null
   * @return the value with ASCII commas turned into full-width commas and double quotes
   *         removed, or "NULL" for null
   */
  def cleanCName(value: String): String = stripDelimiters(value)

  /** Shared rule behind [[cleanName]] and [[cleanCName]]. */
  private def stripDelimiters(text: String): String =
    if (text == null) "NULL"
    else text.replace(",", "，").replace("\"", "")

  // ---------------------------------------------------------------------------
  // Education
  // ---------------------------------------------------------------------------

  // Education levels that are kept unchanged.
  val edus = Array("初中及以下", "大专", "本科", "博士", "硕士")

  /**
   * Normalises the education requirement.
   *
   * @param edu raw value, may be null
   * @return `edu` itself when it is one of [[edus]]; "高中/中技/中专" for the three
   *         secondary-level values; otherwise "NULL" (such rows are filtered out later)
   */
  def cleanEdu(edu: String): String = edu match {
    case "中技" | "中专" | "高中" => "高中/中技/中专"
    case known if edus.contains(known) => known
    case _ => "NULL"
  }

  // ---------------------------------------------------------------------------
  // Company type / trade
  // ---------------------------------------------------------------------------

  /**
   * Normalises the company type.
   *
   * @param ctype raw value, may be null
   * @return "NULL" for null or blank input, "外资" for any value containing "外资",
   *         otherwise the value unchanged
   */
  def cleanCtype(ctype: String): String =
    if (ctype == null || ctype.trim.isEmpty) "NULL"
    else if (ctype.contains("外资")) "外资"
    else ctype

  /**
   * Normalises the company trade list so that every separator becomes '|'.
   *
   * @param ctrade raw value, may be null
   * @return "NULL" for null or blank input; otherwise the value with each of
   *         `/ , | ， ( ) 、` replaced by '|'
   */
  def cleanCtrade(ctrade: String): String =
    if (ctrade == null || ctrade.trim.isEmpty) "NULL"
    else ctrade.replaceAll("[/,|，()、]", "|")

  // ---------------------------------------------------------------------------
  // Experience
  // ---------------------------------------------------------------------------

  // Raw experience values known to the parser.
  val exps = Array("无需经验", "在校生/应届生", "1年经验", "2年经验"
    , "3-4年经验", "5-7年经验", "8-9年经验", "10年以上经验")

  /**
   * Buckets the experience requirement into
   * "无需经验", "1-2年", "3-4年", "5-9年" or "10年以上".
   *
   * @param exp raw value, may be null
   * @return the bucket label, or "NULL" for any unrecognised value
   */
  def cleanExp(exp: String): String = exp match {
    case "无需经验" | "在校生/应届生" => "无需经验"
    case "1年经验" | "2年经验" => "1-2年"
    case "3-4年经验" => "3-4年"
    case "5-7年经验" | "8-9年经验" => "5-9年"
    case "10年以上经验" => "10年以上"
    case _ => "NULL"
  }

  // ---------------------------------------------------------------------------
  // City
  // ---------------------------------------------------------------------------

  /**
   * Extracts the city from a "city-district" value.
   *
   * @param city raw value, may be null
   * @return the text before the first '-', or "NULL" when the input is null, when that
   *         text is blank, or when it is listed in `Datas.provinces` (province-only
   *         locations are discarded for now)
   */
  def cleanCity(city: String): String =
    Option(city)
      // "-".split("-") yields an empty array, so never index position 0 directly.
      .flatMap(_.split("-").headOption)
      .filter(head => head.trim.nonEmpty && !Datas.provinces.contains(head))
      .getOrElse("NULL")

  // ---------------------------------------------------------------------------
  // URL / id
  // ---------------------------------------------------------------------------

  // Examples:
  //   https://jobs.51job.com/beijing-cpq/98900133.html?s=01&t=0
  //   https://jobs.51job.com/yancheng/120818124.html?s=01&t=0
  // '.' and '?' are regex metacharacters and must be escaped; only the id part is captured.
  val urlPattern: Regex = "https://jobs\\.51job\\.com/(.*?)\\.html\\?s=01&t=0".r

  /**
   * Derives the posting id from its URL.
   *
   * @param url posting URL, may be null
   * @return the path between the host and ".html" with '/' replaced by '-'
   *         (e.g. "yancheng-120818124"), or "NULL" when the URL does not match,
   *         in which case the offending URL is printed
   */
  def cleanUrl(url: String): String = url match {
    case null =>
      println("获取道 错误的url id" + url)
      "NULL"
    case urlPattern(id) => id.replace("/", "-")
    case _ =>
      println("获取道 错误的url id" + url)
      "NULL"
  }

  // ---------------------------------------------------------------------------
  // Publish time
  // ---------------------------------------------------------------------------

  // Publish time must look like "MM-DD发布".
  val pubPattern: Regex = "(\\d+-\\d+)发布".r

  /**
   * Extracts the "MM-DD" part of the publish time.
   *
   * @param pub raw value, may be null
   * @return the date part, or "NULL" when the input is null, does not match
   *         [[pubPattern]], or contains "10-" (the few October postings are filtered out)
   */
  def cleanPubtime(pub: String): String = pub match {
    case null => "NULL"
    case pubPattern(date) if !pub.contains("10-") => date
    case _ => "NULL"
  }

  // ---------------------------------------------------------------------------
  // Head-count
  // ---------------------------------------------------------------------------

  val numPattern = "招(.*?)人".r

  /**
   * Parses the number of openings from text such as "招5人".
   *
   * @param num raw value, may be null
   * @return 1 for "招若干人" ("several"), the parsed number for "招N人", and -1 when the
   *         text does not match or N is not an integer
   */
  def cleanNum(num: String): Int = num match {
    case null => -1
    case "招若干人" => 1
    case numPattern(n) => scala.util.Try(n.trim.toInt).getOrElse(-1)
    case _ => -1
  }

  // ---------------------------------------------------------------------------
  // Row -> Job
  // ---------------------------------------------------------------------------

  /**
   * Converts one input row into a cleaned [[Job]].
   *
   * Columns read from the row:
   * cate1, cate2, city, cname, cnum, ctrade, ctype, detail, edu, exp, name, num,
   * pubtime, salary, url, welfare.
   *
   * @param line input row
   * @return the job with every field cleaned; `province` is left empty and is filled in
   *         later, and a missing `cnum` becomes "未知规模"
   */
  def parseJob(line: Row): Job = {
    val url: String = cleanUrl(line.getAs[String]("url"))
    val name: String = cleanName(line.getAs[String]("name"))
    val salary: Int = cleanSalary(line.getAs[String]("salary")).toInt
    // The province is added in a later step.
    val province: String = ""
    val city: String = cleanCity(line.getAs[String]("city"))
    val exp: String = cleanExp(line.getAs[String]("exp"))
    val edu: String = cleanEdu(line.getAs[String]("edu"))
    val num: Int = cleanNum(line.getAs[String]("num"))
    val pubtime: String = cleanPubtime(line.getAs[String]("pubtime"))
    val cname: String = cleanCName(line.getAs[String]("cname"))
    val ctype: String = cleanCtype(line.getAs[String]("ctype"))
    val ctrade: String = cleanCtrade(line.getAs[String]("ctrade"))
    val cnum: String = Option(line.getAs[String]("cnum")).getOrElse("未知规模")
    val cate1: String = line.getAs[String]("cate1")
    val cate2: String = line.getAs[String]("cate2")
    val welfare: String = line.getAs[String]("welfare")
    val detail: String = line.getAs[String]("detail")

    Job(
      url,
      name,
      salary,
      province,
      city,
      exp,
      edu,
      num,
      pubtime,
      cname,
      ctype,
      ctrade,
      cnum,
      cate1,
      cate2,
      welfare,
      detail
    )
  }
}

/** Companion object: the output column names plus a small manual smoke test. */
object qcParser {

  // Output column names; the order matches the argument order parseJob uses when building a Job.
  val cols = Array("url","name","salary","province","city","exp","edu","num","pubtime","cname","ctype","ctrade","cnum","cate1","cate2","welfare","detail")

  /**
   * Manual smoke test: prints each sample salary string next to its cleaned value,
   * followed by the id extracted from a sample URL.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    // One parser is enough; it holds no per-call state.
    val parser = new qcParser()

    val tests = Array(
      "15元/小时",
      "200元/天",
      "1.5-2千/月",
      "7-12万/月",
      "24-28.8万/年",
      "10万以上/月",
      "1.5千以下/月"
    )
    // Print "input<TAB>result"; passing several arguments to println would print a tuple instead.
    tests.foreach(s => println(s"$s\t${parser.cleanSalary(s)}"))
    println(parser.cleanUrl("https://jobs.51job.com/yancheng/120818124.html?s=01&t=0"))
  }
}


